Import libraries
source("R/bioage_estimate_median.R")
source("R/gompertz_draw.R")
source("R/weibull_draw.R")
source("R/generate_data.R")
library(tidyverse)
library(MASS)
library(psbcGroup)
Contents “generate_data.R” file
Wrapper
generate_data = function(pop_n, obs_n, p, g, ltname, betafrac, seed, force_recalc, X_plots, a = exp(-9), b = 0.085, X_rho, β_rho, non_zero_betas, X_scale, active_hazard, betasep, bscale) {
print("Generating βs...")
betas = generate_betas(p = p, g=g, rho = β_rho, rho_between = 0,
seed = seed, mu_u = betasep, mu_l = -betasep, beta_scale = bscale,
beta_denom = betafrac, plot = T, non_zero_groups = non_zero_betas, active_hazard = active_hazard)
# betas$beta = betas$beta + 0.01
print("Generating X...")
X = generate_X(n = pop_n, p = p, g = g, rho = X_rho, seed = seed,
rho_between = 0,
X_plots = X_plots, scale = X_scale)
print("Done!")
print("Generating population lifetable...")
lt = generate_population_lifetable(N_pop = pop_n, M = p, betas = betas$beta,
X = X$X, filename = ltname,
seed = seed, force_recalc = force_recalc, a=a, b=b)
print("Done!")
plot = ggplot(lt, aes(x = t, y = mrl)) +
geom_line() +
labs(
title = "Population Lifetable MRL",
x = "Chronological age (years)",
y = "Mean Residual Life"
) +
theme_minimal()
print(plot)
print("Generating data from lifetable...")
df_sim = create_dataset(M = p, n_obs = obs_n, G = g,
gsize = (p/g), lt = lt, betas = betas$beta,
seednr = seed+1, X_plots = F, a=a, b=b, X_rho = X_rho,
X_scale = X_scale, followup = 20)
return(list(df = df_sim, true_betas = betas))
}
Function performance
source("R/generate_data.R")
aft_reg = function(df_sim, p) {
true_betas = df_sim$true_betas
df_sim = df_sim$df
# Train/test split 50/50
train_indices = sample(1:nrow(df_sim), nrow(df_sim)/2)
df_sim_train = df_sim[train_indices,]
df_sim_test = df_sim[-train_indices,]
# Create the formula for AFT regression
predictor_vars = paste(c(glue("x{1:p}")), collapse = " + ")
aft_formula = as.formula(paste("Surv(age_start, age_end, status) ~", predictor_vars))
# Fit the AFT model
fit_aft = tryCatch({
fit_aft = eha::aftreg(
formula = aft_formula,
data = df_sim_train,
dist = "gompertz",
control = list(
maxit = 100,
trace = FALSE
)
)
fit_aft
},
error = function(e) {
message("AFT model error: ", e$message)
NA
})
# If model fitting failed, return NA
if (is.na(fit_aft)[1]) {
return(NA)
}
# get estimated parameters
sigma_est <- exp(fit_aft$coefficients["log(scale)"]) # canonical parametrization
tau_est <- exp(fit_aft$coefficients["log(shape)"]) # canonical parametrization
a_est <- tau_est / sigma_est
b_est <- 1 / sigma_est
betas_est_aft <- fit_aft$coefficients[1:p]
linpred_aft <- rowSums(sweep(df_sim_test[,1:p], 2, betas_est_aft, "*")) # SCALE
for (i in 1:nrow(df_sim_test)){
mrl_uncon <- integrate(gomp_baseline_surv, lower = (df_sim_test$age_start[i] * exp(linpred_aft[i])),
upper=Inf, a = a_est, b = b_est)$value
s_cond <- gomp_baseline_surv(df_sim_test$age_start[i] * exp(linpred_aft[i]), a = a_est, b = b_est)
df_sim_test$pred_mrl[i] <- (mrl_uncon / s_cond) * exp(-linpred_aft[i])
}
plt = df_sim_test %>% ggplot(aes(x = mrl, y = pred_mrl)) +
geom_point() +
geom_abline(color = 'red') +
theme_minimal()
print(plt)
# Get the bias
aftreg_coef_rmse = sqrt(mean((betas_est_aft - true_betas$beta)^2))
# Get the RMSE
aftreg_rmse = sqrt(mean((df_sim_test$mrl - df_sim_test$pred_mrl)^2))
# Plot difference between true and estimates betas
true_betas$betahat = betas_est_aft
plt = true_betas %>%
mutate(index = 1:p) %>%
ggplot(aes(x = index)) +
# Vertical lines between pred and actual
geom_segment(aes(xend = index, y = beta, yend = betahat), color = "gray") +
geom_point(aes(y = beta, color = "True")) +
geom_point(aes(y = betahat, color = "Predicted")) +
scale_color_manual(values = c("True" = "black", "Predicted" = "skyblue"),
name = "Type") +
ylab("β") +
xlab("Index") +
labs(title = "True vs Predicted β-coefficients for AFT", subtitle = glue("RMSE = {aftreg_coef_rmse %>% round(4)}")) +
theme_minimal()
print(plt)
return(list(prediction = aftreg_rmse, coefficients = aftreg_coef_rmse))
}
psbc_reg = function(df_sim, p, g) {
true_betas = df_sim$true_betas
df_sim = df_sim$df
# Train/test split 50/50
train_indices = sample(1:nrow(df_sim), nrow(df_sim)/2)
df_sim_train = df_sim[train_indices,]
df_sim_test = df_sim[-train_indices,]
Y_train = cbind(df_sim_train$age_start, df_sim_train$age_end, df_sim_train$status)
Y_test = cbind(df_sim_test$age_start, df_sim_test$age_end, df_sim_test$status)
# Create covariate matrix X
X_train = as.matrix(df_sim_train[, paste0("x", 1:p)])
X_test = as.matrix(df_sim_test[, paste0("x", 1:p)])
XC = NULL # CONFOUNDERS
# Create group indicator for each variable
grpInx = rep(1:g, each = p/g)
# Set hyperparameters (PRIOR)?
hyperParams = list(
a.sigSq = 0.01, b.sigSq = 0.01,
mu0 = 0, h0 = 0.01,
v = 0.1
)
# Set starting values https://cran.r-project.org/web/packages/psbcGroup/psbcGroup.pdf
w = log(Y_train[,1])
mu = 0.1
beta = rep(0.1, p)
sigSq = 0.5
tauSq = rep(0.4, p)
lambdaSq = 10
betaC = rep(0.11, 0) # no conf
startValues = list(w=w, beta=beta, tauSq=tauSq, mu=mu, sigSq=sigSq,
lambdaSq=lambdaSq, betaC=betaC)
# MCMC parameters format __ CHANGE __
mcmcParams = list(
run = list(
numReps = 5000,
thin = 5, # Update params every n steps
burninPerc = 0.2 # Discard the first ... steps
),
tuning = list(
mu.prop.var = 0.1,
sigSq.prop.var = 0.005,
L.beC = 50,
M.beC = 1,
eps.beC = 0.001,
L.be = 100,
M.be = 1,
eps.be = 0.001
)
)
fit_bayes_aft = aftGL_LT(Y_train, X_train, XC, grpInx, hyperParams, startValues, mcmcParams)
# Extract betas using the maximum likelihood via density
get_mode = function(x){
xdist = density(x)
mode = xdist$x[which.max(xdist$y)]
return(mode)
}
betas = fit_bayes_aft$beta.p %>% apply(MARGIN = 2, FUN = get_mode)
# Make linear predictor for MRL
X_test = X_test %>% as.matrix
linpred = X_test %*% betas
# Change to lognormal
# Create baseline lognormal survival
# Implemented in R -> check log scales
# dlnorm, use any of them 1 - F(x)
# instead of gomp_baseline_surv no false, 1-
# a = exp(-9)
# b = 0.085
# 1-plnorm(1:100, meanlog = fit_bayes_aft$mu.p[120], sdlog = fit_bayes_aft$sigSq.p[120])
# Estimate mean from MCMC draws
mu_est = mean(fit_bayes_aft$mu.p)
sigSq_est = mean(fit_bayes_aft$sigSq.p)
for (i in 1:nrow(df_sim_test)){
mrl_uncon = integrate(function(x) 1 - plnorm(x, meanlog = mu_est, sdlog = sqrt(sigSq_est)), # Var to Sd via sqrt
lower = (df_sim_test$age_start[i] * exp(linpred[i])),
upper = Inf)$value
s_cond = 1 - plnorm(df_sim_test$age_start[i] * exp(linpred[i]), meanlog = mu_est, sdlog = sqrt(sigSq_est))
df_sim_test$pred_mrl[i] <- (mrl_uncon / s_cond) * exp(-linpred[i])
}
plt = df_sim_test %>% ggplot(aes(x = mrl, y = pred_mrl)) +
geom_point() +
geom_abline(color = 'red') +
theme_minimal()
print(plt)
# Get the RMSE
pred_RMSE = sqrt(mean((df_sim_test$mrl - df_sim_test$pred_mrl)^2))
beta_RMSE =sqrt(mean((betas - true_betas$beta)^2))
# Plot difference between true and estimates betas
true_betas$betahat = betas
plt = true_betas %>%
mutate(index = 1:p) %>%
ggplot(aes(x = index)) +
# Vertical lines between pred and actual
geom_segment(aes(xend = index, y = beta, yend = betahat), color = "gray") +
geom_point(aes(y = beta, color = "True")) +
geom_point(aes(y = betahat, color = "Predicted")) +
scale_color_manual(values = c("True" = "black", "Predicted" = "skyblue"),
name = "Type") +
ylab("β") +
xlab("Index") +
labs(title = "True vs Predicted β-coefficients for Bayesian AFT", subtitle = glue("RMSE = {beta_RMSE %>% round(4)}")) +
theme_minimal()
print(plt)
return(list(prediction = pred_RMSE, coefficients = beta_RMSE))
}
run_experiment = function(pop_n = 20000, obs_n = 1000, p = 20, g = 4, ltname, bfrac, seed, nzb, bsep, X_rho) {
print("<<<<GENERATING SIMULATED DATA>>>>")
df_sim = generate_data(pop_n = pop_n, obs_n = obs_n, p = p, g = g,
ltname = ltname, betafrac = bfrac, seed = 123,
force_recalc = F, X_plots = T, a = exp(-9), b = 0.085,
X_rho = X_rho, β_rho = 0.9, non_zero_betas = nzb, X_scale = 1,
#active_hazard = 0.025, betasep = 2)
active_hazard = (0.05*sqrt(p))/nzb/bfrac/bsep,
betasep = bsep, bscale = 0.25)
set.seed(seed+1)
print("<<<<RUNNING AFT>>>>")
rmse_aft = aft_reg(df_sim = df_sim, p = p)
set.seed(seed+1)
print("<<<<RUNNING PSBC BAYESIAN>>>>")
rmse_psbc = psbc_reg(df_sim = df_sim, p = p, g = g)
return(list(aft = rmse_aft, psbc = rmse_psbc))
}
Experiments
# p = 20
rmses = run_experiment(p = 20, g = 4, ltname = "p20_may20", bfrac = 40, seed = 42, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.215071322236047
## Lower range of beta values pre-scaling: -0.825018359139732
## Upper range of beta values pre-scaling: 1.69028592387452



## [1] "Generating X..."
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
## Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.




## [1] "Done!"
## [1] "Generating population lifetable..."
## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 791 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 791
## Range of linpred values: -0.1849455 0.2802615
## [1] "<<<<RUNNING AFT>>>>"


## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"
## iteration: 1000: Wed May 28 12:04:41 2025
##
## iteration: 2000: Wed May 28 12:05:08 2025
##
## iteration: 3000: Wed May 28 12:05:35 2025
##
## iteration: 4000: Wed May 28 12:06:01 2025
##
## iteration: 5000: Wed May 28 12:06:28 2025


print(rmses)
## $aft
## $aft$prediction
## [1] 4.485843
##
## $aft$coefficients
## [1] 0.01431187
##
##
## $psbc
## $psbc$prediction
## [1] 15.11173
##
## $psbc$coefficients
## [1] 0.01585668
rmses = run_experiment(p = 60, g = 4, ltname = "p60_may20", bfrac = 40, seed = 43, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.0701313093887515
## Lower range of beta values pre-scaling: -0.68035102718843
## Upper range of beta values pre-scaling: 1.18615518586129



## [1] "Generating X..."




## [1] "Done!"
## [1] "Generating population lifetable..."
## Range of death: 0.432065781729277
## Range of death: 127.549335244695

## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 809 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 809
## Range of linpred values: -0.2904009 0.2777132
## [1] "<<<<RUNNING AFT>>>>"


## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"
## iteration: 1000: Wed May 28 12:07:25 2025
##
## iteration: 2000: Wed May 28 12:08:11 2025
##
## iteration: 3000: Wed May 28 12:08:59 2025
##
## iteration: 4000: Wed May 28 12:09:47 2025
##
## iteration: 5000: Wed May 28 12:10:34 2025


print(rmses)
## $aft
## $aft$prediction
## [1] 10.44082
##
## $aft$coefficients
## [1] 0.01816652
##
##
## $psbc
## $psbc$prediction
## [1] 16.92182
##
## $psbc$coefficients
## [1] 0.01612463
#
# rmses = run_experiment(p = 100, g = 4, ltname = "p100_may20", bfrac = 40, seed = 44, nzb = 0.5, bsep = 2, X_rho = 0)
# print(rmses)
#
rmses = run_experiment(p = 200, g = 4, ltname = "p200_may20", bfrac = 40, seed = 45, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.148329628219689
## Lower range of beta values pre-scaling: -1.13669894797518
## Upper range of beta values pre-scaling: 1.65937506505303



## [1] "Generating X..."




## [1] "Done!"
## [1] "Generating population lifetable..."
## Range of death: 0.588888986973458
## Range of death: 150

## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 767 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 767
## Range of linpred values: -0.8091164 1.047817
## [1] "<<<<RUNNING AFT>>>>"
## AFT model error: Singular hessian; suspicious variable No. TRUE:
## =
## Try running with fixed shape
## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"
## iteration: 1000: Wed May 28 12:14:07 2025
##
## iteration: 2000: Wed May 28 12:16:22 2025
##
## iteration: 3000: Wed May 28 12:18:37 2025
##
## iteration: 4000: Wed May 28 12:20:51 2025
##
## iteration: 5000: Wed May 28 12:23:04 2025


print(rmses)
## $aft
## [1] NA
##
## $psbc
## $psbc$prediction
## [1] 24.8586
##
## $psbc$coefficients
## [1] 0.01872657
Monte carlo
# Parameters grid:
# p
# g
#
# MC = replicate(10000, {
#
# })